In [13]:
import nltk.data
from bs4 import BeautifulSoup
import operator
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import string
import re
# Idea: get the most common words (done), then show which deputies have these words in common
# or which deputies talk the most about certain topics
In [16]:
# A set containing the deputies' names
deputes_list = set()
# This will contain the deputies' speeches (the key being their name)
deputes_paroles = {}
# This will contain their titles (for example: Conseiller d'Etat)
deputes_titres = {}
# We compile the regular expression used to find the end of the name and title
# at the beginning of each speech
p = re.compile(r"\.")

def open_memorial(filepath):
    with open(filepath, 'r') as f:
        html = f.read()
    doc = BeautifulSoup(html, 'html.parser')
    memos = doc.select('.txtMemo p')
    for memo in memos:
        depute = memo.select('a.style_pdg_gras')
        if depute:
            nom_depute = depute[0].string
            deputes_list.add(nom_depute)
            if nom_depute not in deputes_paroles:
                deputes_paroles[nom_depute] = []
            speech = memo.get_text()
            # We remove the name and title: "M. Nom Député (Parti). Bla bla blaaa"
            # The search starts at offset 3 (skipping "M. " or "Mme"), so the
            # closing period sits at startpos + 3 in the full string.
            startpos = p.search(speech[3:]).start()
            deputes_paroles[nom_depute].append(speech[startpos+5:])
            if nom_depute not in deputes_titres:
                deputes_titres[nom_depute] = []
                deputes_titres[nom_depute].append(speech[:startpos+4])
            else:
                if speech[:startpos+4] not in deputes_titres[nom_depute]:
                    deputes_titres[nom_depute].append(speech[:startpos+4])

for o in range(0, 19):  # the files scraped for the year 2016
    open_memorial('memoriaux/memoriaux' + str(o) + '.html')

print("Députés enregistrés:", len(deputes_list))
print("Speeches enregistrés:", sum(len(deputes_paroles[i]) for i in deputes_paroles))
In [17]:
# tokenizer = nltk.data.load('tokenizers/punkt/PY3/french.pickle')
tokenizer = TreebankWordTokenizer()
french_stopwords = set(stopwords.words('french'))
french_stopwords.update(['les', 'alors', '(', ')', ',', '-', '.', 'M', 'M.', 'Mme', 'a', 'être'])
# Translation table that strips punctuation but keeps apostrophes
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation if char not in ["'"])

def analyze_text(text):
    tokens = tokenizer.tokenize(text)
    ntokens = [token for token in tokens if token.lower() not in french_stopwords]
    ntokens = [token.translate(remove_punctuation_map) for token in ntokens]
    ntokens = [token for token in ntokens if token != '']
    fdist = FreqDist(ntokens)
    # Sort the (word, count) pairs by descending frequency
    sorted_x = sorted(fdist.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_x

# print(analyze_text(deputes_paroles['M. Patrick Lussi'][1])[:10])  # use this line to inspect a specific speech

# This prints our results before we do the json export:
for key in deputes_paroles.keys():
    blob_text = ' '.join(deputes_paroles[key])
    nb_speeches = len(deputes_paroles[key])
    if nb_speeches > 2:
        print('|'.join(deputes_titres[key]), "(" + str(nb_speeches) + " prises de parole):")
        result = analyze_text(blob_text)
        for word, count in result[:15]:
            if count > 2:
                print(str(count) + "\t" + word)
        print("\n")
In [20]:
# Preparation for our json: we transform the dictionary into a list of dictionaries
results_list = []
for key in deputes_paroles.keys():
    blob_text = ' '.join(deputes_paroles[key])
    nb_speeches = len(deputes_paroles[key])
    if nb_speeches > 2:
        result = analyze_text(blob_text)
        # Keep only the three most frequent words for each deputy
        words = []
        for word, count in result[:3]:
            words.append(word)
        results_list.append({'name': key, 'words': words})
results_list
Out[20]:
In [19]:
# We generate a json file containing the nodes data
import json

data = []

## Return an array with the other nodes to connect to
def import_words(word_list):
    lines = []
    for word in word_list:
        lines.append('Title.' + word)
    return lines

def import_authors(author_list):
    lines = []
    for author in author_list:
        lines.append('Author.' + author)
    return lines

## Create the node data
def generate_word(word, deputee):
    element = {
        "name": "Title." + word,
        "size": 0,
        "imports": import_authors(deputee)
    }
    return element

def generate_author(deputee, words):
    element = {
        "name": "Author." + deputee,
        "size": 0,
        "imports": import_words(words)
    }
    return element

for item in results_list:
    deputee_name = item['name']
    for single_word in item['words']:
        data.append(generate_word(single_word, [deputee_name]))
    deputee = generate_author(deputee_name, item['words'])
    data.append(deputee)

## Export
with open('microdata2.json', 'w') as outfile:
    json.dump(data, outfile, indent=4, separators=(',', ': '))
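For reference, each record written to microdata2.json follows the name / size / imports structure used by D3's flare-style examples, which the visualisation below builds on. With a hypothetical deputy and hypothetical words, a word node and its author node would look like this:

{"name": "Title.budget", "size": 0, "imports": ["Author.M. Jean Exemple"]}
{"name": "Author.M. Jean Exemple", "size": 0, "imports": ["Title.budget", "Title.canton", "Title.loi"]}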
In [36]:
from IPython.core.display import display, HTML
print("Here is the result. Note: this is a work in progress -- at this time, I just use a modified D3 example!")
display(HTML('<iframe src="http://paulronga.ch/test/dataviz.html" width="900" height="900"></iframe>'))
# alternatively: iframe version:
# from IPython.display import IFrame
# display(IFrame('http://paulronga.ch/test/dataviz.html', width=900, height=900))
(External content is disabled on GitHub - open http://paulronga.ch/test/dataviz.html directly if nothing is displayed)